regression models
test_df <- test_data |> select(-BMXWAIST)
run_model <- function(formula_str,train_data_set=train_data,
test_data_set=test_df){
# setup regression model
lm_reg = lm(formula = as.formula(formula_str), train_data_set)
print(summary(lm_reg))
# run prediction
lm_pred = predict(lm_reg, newdata = test_df, se = T)
# save prediction results
pred_df = data.frame(
fit = lm_pred$fit,
weight = test_data$BMXWT,
sex = test_data$RIAGENDR,
label = test_data$BMXWAIST
)
# compute MSE
mse<- mean_square_error(pred_df$fit, pred_df$label)
#plot results
g <- ggplot(pred_df, aes(x = weight, y = label)) + geom_point(colour = "black",alpha = 0.1) +
geom_point(aes(x = weight, y = fit, colour = sex,alpha = 0.1),
size = 1.5) + ylab("waist circumference")
g+ggtitle(paste("MSE = ",mse))
}
run_model("BMXWAIST ~ BMXWT")
##
## Call:
## lm(formula = as.formula(formula_str), data = train_data_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.7268 -4.9769 -0.2348 4.9041 29.2490
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 43.75908 0.59288 73.81 <2e-16 ***
## BMXWT 0.68446 0.00704 97.22 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.253 on 2498 degrees of freedom
## Multiple R-squared: 0.7909, Adjusted R-squared: 0.7909
## F-statistic: 9451 on 1 and 2498 DF, p-value: < 2.2e-16

run_model("BMXWAIST ~ bs(BMXWT)")
##
## Call:
## lm(formula = as.formula(formula_str), data = train_data_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -26.1660 -4.7950 -0.2237 4.8580 28.9910
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 66.589 1.020 65.27 <2e-16 ***
## bs(BMXWT)1 38.225 3.036 12.59 <2e-16 ***
## bs(BMXWT)2 72.230 2.532 28.52 <2e-16 ***
## bs(BMXWT)3 90.265 4.069 22.18 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.208 on 2496 degrees of freedom
## Multiple R-squared: 0.7937, Adjusted R-squared: 0.7934
## F-statistic: 3201 on 3 and 2496 DF, p-value: < 2.2e-16
## Warning in bs(BMXWT, degree = 3L, knots = numeric(0), Boundary.knots = c(37, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases

run_model("BMXWAIST ~ bs(RIDAGEYR, df = 7) + BMXWT + RIAGENDR + years")
##
## Call:
## lm(formula = as.formula(formula_str), data = train_data_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.6713 -4.1404 -0.1093 4.1705 26.5294
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.193372 0.872496 42.629 < 2e-16 ***
## bs(RIDAGEYR, df = 7)1 1.337644 1.317056 1.016 0.30990
## bs(RIDAGEYR, df = 7)2 1.362764 0.942823 1.445 0.14847
## bs(RIDAGEYR, df = 7)3 3.322829 1.015300 3.273 0.00108 **
## bs(RIDAGEYR, df = 7)4 5.692912 0.887975 6.411 1.72e-10 ***
## bs(RIDAGEYR, df = 7)5 9.271967 1.138759 8.142 6.08e-16 ***
## bs(RIDAGEYR, df = 7)6 10.340631 1.206769 8.569 < 2e-16 ***
## bs(RIDAGEYR, df = 7)7 12.510724 1.239543 10.093 < 2e-16 ***
## BMXWT 0.726116 0.006218 116.774 < 2e-16 ***
## RIAGENDRMale -4.714737 0.252704 -18.657 < 2e-16 ***
## years2005-2006 -0.217119 0.523583 -0.415 0.67841
## years2007-2008 -0.442658 0.493215 -0.897 0.36954
## years2009-2010 0.332303 0.486776 0.683 0.49489
## years2013-2014 1.029248 0.501634 2.052 0.04029 *
## years2015-2016 1.491400 0.515688 2.892 0.00386 **
## years2017-2018 0.364910 0.505424 0.722 0.47037
## years2022-2012 -0.048247 0.506252 -0.095 0.92408
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.045 on 2483 degrees of freedom
## Multiple R-squared: 0.8556, Adjusted R-squared: 0.8547
## F-statistic: 919.7 on 16 and 2483 DF, p-value: < 2.2e-16

run_model("BMXWAIST ~ ns(RIDAGEYR, df = 7) + BMXWT + RIAGENDR + years")
##
## Call:
## lm(formula = as.formula(formula_str), data = train_data_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.9563 -4.1284 -0.1112 4.1327 26.5222
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.077231 0.793335 46.736 < 2e-16 ***
## ns(RIDAGEYR, df = 7)1 1.196274 0.774614 1.544 0.12263
## ns(RIDAGEYR, df = 7)2 4.899847 0.982551 4.987 6.56e-07 ***
## ns(RIDAGEYR, df = 7)3 4.941640 0.839101 5.889 4.41e-09 ***
## ns(RIDAGEYR, df = 7)4 8.011813 0.862736 9.287 < 2e-16 ***
## ns(RIDAGEYR, df = 7)5 8.597619 0.831967 10.334 < 2e-16 ***
## ns(RIDAGEYR, df = 7)6 13.486953 1.435079 9.398 < 2e-16 ***
## ns(RIDAGEYR, df = 7)7 11.012311 0.824097 13.363 < 2e-16 ***
## BMXWT 0.726016 0.006214 116.828 < 2e-16 ***
## RIAGENDRMale -4.714870 0.252478 -18.674 < 2e-16 ***
## years2005-2006 -0.252181 0.522454 -0.483 0.62936
## years2007-2008 -0.452088 0.489407 -0.924 0.35571
## years2009-2010 0.309566 0.481554 0.643 0.52038
## years2013-2014 1.013463 0.498535 2.033 0.04217 *
## years2015-2016 1.480495 0.511400 2.895 0.00382 **
## years2017-2018 0.338513 0.501048 0.676 0.49935
## years2022-2012 -0.063646 0.501525 -0.127 0.89903
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.041 on 2483 degrees of freedom
## Multiple R-squared: 0.8558, Adjusted R-squared: 0.8549
## F-statistic: 921.1 on 16 and 2483 DF, p-value: < 2.2e-16

run_model("BMXWAIST ~ bs(RIDAGEYR, df = 7) + bs(BMXWT) + RIAGENDR + BMXHT")
##
## Call:
## lm(formula = as.formula(formula_str), data = train_data_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.6329 -3.1637 -0.0372 3.3221 23.9405
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 132.18669 2.50117 52.850 < 2e-16 ***
## bs(RIDAGEYR, df = 7)1 1.66522 1.10475 1.507 0.13185
## bs(RIDAGEYR, df = 7)2 1.40088 0.79283 1.767 0.07736 .
## bs(RIDAGEYR, df = 7)3 2.71895 0.85240 3.190 0.00144 **
## bs(RIDAGEYR, df = 7)4 4.67823 0.74466 6.282 3.92e-10 ***
## bs(RIDAGEYR, df = 7)5 8.18036 0.95349 8.579 < 2e-16 ***
## bs(RIDAGEYR, df = 7)6 8.32671 1.00634 8.274 < 2e-16 ***
## bs(RIDAGEYR, df = 7)7 10.37791 1.02542 10.121 < 2e-16 ***
## bs(BMXWT)1 50.68186 2.18057 23.243 < 2e-16 ***
## bs(BMXWT)2 81.89221 1.80444 45.384 < 2e-16 ***
## bs(BMXWT)3 107.21009 2.90621 36.890 < 2e-16 ***
## RIAGENDRMale 0.53331 0.28350 1.881 0.06007 .
## BMXHT -0.46426 0.01511 -30.731 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.081 on 2487 degrees of freedom
## Multiple R-squared: 0.8978, Adjusted R-squared: 0.8974
## F-statistic: 1822 on 12 and 2487 DF, p-value: < 2.2e-16
## Warning in bs(BMXWT, degree = 3L, knots = numeric(0), Boundary.knots = c(37, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases

run_model("BMXWAIST ~ bs(RIDAGEYR, df = 7) + BMXWT + RIAGENDR + BMXHT + years")
##
## Call:
## lm(formula = as.formula(formula_str), data = train_data_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.3658 -3.3131 -0.0423 3.3434 23.6241
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 106.384953 2.493452 42.666 < 2e-16 ***
## bs(RIDAGEYR, df = 7)1 1.878036 1.137542 1.651 0.098873 .
## bs(RIDAGEYR, df = 7)2 1.242629 0.814219 1.526 0.127097
## bs(RIDAGEYR, df = 7)3 3.022987 0.876859 3.448 0.000575 ***
## bs(RIDAGEYR, df = 7)4 4.843558 0.767397 6.312 3.26e-10 ***
## bs(RIDAGEYR, df = 7)5 8.449445 0.983821 8.588 < 2e-16 ***
## bs(RIDAGEYR, df = 7)6 8.664630 1.043737 8.302 < 2e-16 ***
## bs(RIDAGEYR, df = 7)7 10.307882 1.073123 9.606 < 2e-16 ***
## BMXWT 0.788480 0.005781 136.381 < 2e-16 ***
## RIAGENDRMale 0.863035 0.290412 2.972 0.002989 **
## BMXHT -0.453963 0.015595 -29.110 < 2e-16 ***
## years2005-2006 -0.035627 0.452201 -0.079 0.937209
## years2007-2008 -0.705950 0.426029 -1.657 0.097636 .
## years2009-2010 -0.142005 0.420688 -0.338 0.735729
## years2013-2014 0.506923 0.433575 1.169 0.242448
## years2015-2016 0.543725 0.446528 1.218 0.223464
## years2017-2018 -0.387545 0.437242 -0.886 0.375519
## years2022-2012 -0.269318 0.437257 -0.616 0.538001
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.221 on 2482 degrees of freedom
## Multiple R-squared: 0.8924, Adjusted R-squared: 0.8916
## F-statistic: 1211 on 17 and 2482 DF, p-value: < 2.2e-16

run_model("BMXWAIST ~ bs(RIDAGEYR, df = 7) + invNorm(BMXWT) + RIAGENDR + BMXHT + years")
##
## Call:
## lm(formula = as.formula(formula_str), data = train_data_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.7234 -3.5551 -0.1187 3.4965 22.3705
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 170.99494 2.74548 62.282 < 2e-16 ***
## bs(RIDAGEYR, df = 7)1 1.50848 1.18498 1.273 0.20314
## bs(RIDAGEYR, df = 7)2 1.57826 0.84807 1.861 0.06286 .
## bs(RIDAGEYR, df = 7)3 2.55800 0.91360 2.800 0.00515 **
## bs(RIDAGEYR, df = 7)4 4.53068 0.79954 5.667 1.63e-08 ***
## bs(RIDAGEYR, df = 7)5 7.99598 1.02499 7.801 8.97e-15 ***
## bs(RIDAGEYR, df = 7)6 8.25411 1.08726 7.592 4.44e-14 ***
## bs(RIDAGEYR, df = 7)7 10.19473 1.11783 9.120 < 2e-16 ***
## invNorm(BMXWT) 16.30449 0.12525 130.179 < 2e-16 ***
## RIAGENDRMale 0.02846 0.30226 0.094 0.92498
## BMXHT -0.45234 0.01625 -27.832 < 2e-16 ***
## years2005-2006 0.15296 0.47111 0.325 0.74545
## years2007-2008 -0.57111 0.44378 -1.287 0.19824
## years2009-2010 -0.21686 0.43822 -0.495 0.62073
## years2013-2014 0.76372 0.45162 1.691 0.09095 .
## years2015-2016 0.68518 0.46511 1.473 0.14084
## years2017-2018 -0.03682 0.45538 -0.081 0.93556
## years2022-2012 -0.04154 0.45549 -0.091 0.92734
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.438 on 2482 degrees of freedom
## Multiple R-squared: 0.8832, Adjusted R-squared: 0.8824
## F-statistic: 1104 on 17 and 2482 DF, p-value: < 2.2e-16

run_model("BMXWAIST ~ bs(RIDAGEYR, df = 7) + BMXWT + RIAGENDR + invNorm(BMXHT) + years")
##
## Call:
## lm(formula = as.formula(formula_str), data = train_data_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.5232 -3.2377 -0.0766 3.4189 24.1107
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 30.744141 0.790494 38.892 < 2e-16 ***
## bs(RIDAGEYR, df = 7)1 1.849644 1.143505 1.618 0.105893
## bs(RIDAGEYR, df = 7)2 1.177541 0.818511 1.439 0.150379
## bs(RIDAGEYR, df = 7)3 3.045851 0.881457 3.455 0.000559 ***
## bs(RIDAGEYR, df = 7)4 4.792176 0.771517 6.211 6.14e-10 ***
## bs(RIDAGEYR, df = 7)5 8.480503 0.988970 8.575 < 2e-16 ***
## bs(RIDAGEYR, df = 7)6 8.650568 1.049297 8.244 2.66e-16 ***
## bs(RIDAGEYR, df = 7)7 10.183105 1.079167 9.436 < 2e-16 ***
## BMXWT 0.786456 0.005798 135.638 < 2e-16 ***
## RIAGENDRMale 0.597438 0.287836 2.076 0.038032 *
## invNorm(BMXHT) -4.465009 0.156623 -28.508 < 2e-16 ***
## years2005-2006 -0.048951 0.454572 -0.108 0.914253
## years2007-2008 -0.704234 0.428269 -1.644 0.100226
## years2009-2010 -0.161861 0.422936 -0.383 0.701969
## years2013-2014 0.519587 0.435846 1.192 0.233323
## years2015-2016 0.542718 0.448914 1.209 0.226795
## years2017-2018 -0.367735 0.439521 -0.837 0.402858
## years2022-2012 -0.269516 0.439556 -0.613 0.539831
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.248 on 2482 degrees of freedom
## Multiple R-squared: 0.8912, Adjusted R-squared: 0.8905
## F-statistic: 1196 on 17 and 2482 DF, p-value: < 2.2e-16

run_model("BMXWAIST ~ bs(RIDAGEYR, df = 7) + invNorm(BMXWT) + RIAGENDR + invNorm(BMXHT) + years")
##
## Call:
## lm(formula = as.formula(formula_str), data = train_data_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.5689 -3.5363 -0.0743 3.4987 22.8113
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 95.43371 0.67691 140.983 < 2e-16 ***
## bs(RIDAGEYR, df = 7)1 1.48881 1.18369 1.258 0.20859
## bs(RIDAGEYR, df = 7)2 1.50636 0.84717 1.778 0.07551 .
## bs(RIDAGEYR, df = 7)3 2.57038 0.91261 2.817 0.00489 **
## bs(RIDAGEYR, df = 7)4 4.45665 0.79879 5.579 2.68e-08 ***
## bs(RIDAGEYR, df = 7)5 8.00675 1.02387 7.820 7.74e-15 ***
## bs(RIDAGEYR, df = 7)6 8.20666 1.08618 7.556 5.83e-14 ***
## bs(RIDAGEYR, df = 7)7 10.02741 1.11704 8.977 < 2e-16 ***
## invNorm(BMXWT) 16.30079 0.12500 130.403 < 2e-16 ***
## RIAGENDRMale -0.13253 0.29778 -0.445 0.65631
## invNorm(BMXHT) -4.54159 0.16244 -27.958 < 2e-16 ***
## years2005-2006 0.14484 0.47059 0.308 0.75827
## years2007-2008 -0.57490 0.44330 -1.297 0.19480
## years2009-2010 -0.24576 0.43778 -0.561 0.57460
## years2013-2014 0.76576 0.45113 1.697 0.08974 .
## years2015-2016 0.66445 0.46465 1.430 0.15284
## years2017-2018 -0.03294 0.45488 -0.072 0.94228
## years2022-2012 -0.04578 0.45500 -0.101 0.91987
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.432 on 2482 degrees of freedom
## Multiple R-squared: 0.8835, Adjusted R-squared: 0.8827
## F-statistic: 1107 on 17 and 2482 DF, p-value: < 2.2e-16

run_model("BMXWAIST ~ bs(RIDAGEYR, knots = seq(-20,20, by = 10), Boundary.knots = c(-30,100))+ BMXWT + RIAGENDR + years")
##
## Call:
## lm(formula = as.formula(formula_str), data = train_data_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.5033 -4.1369 -0.0908 4.2013 26.3945
##
## Coefficients: (5 not defined because of singularities)
## Estimate
## (Intercept) 51.737411
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))1 NA
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))2 NA
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))3 NA
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))4 NA
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))5 -14.506591
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))6 -13.851904
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))7 -3.432224
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))8 NA
## BMXWT 0.726199
## RIAGENDRMale -4.712019
## years2005-2006 -0.226083
## years2007-2008 -0.491270
## years2009-2010 0.294104
## years2013-2014 0.988924
## years2015-2016 1.439226
## years2017-2018 0.328845
## years2022-2012 -0.095229
## Std. Error
## (Intercept) 2.307985
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))1 NA
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))2 NA
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))3 NA
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))4 NA
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))5 3.393350
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))6 1.404916
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))7 4.484600
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))8 NA
## BMXWT 0.006213
## RIAGENDRMale 0.252431
## years2005-2006 0.521963
## years2007-2008 0.488132
## years2009-2010 0.481255
## years2013-2014 0.498128
## years2015-2016 0.510024
## years2017-2018 0.500337
## years2022-2012 0.500430
## t value
## (Intercept) 22.417
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))1 NA
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))2 NA
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))3 NA
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))4 NA
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))5 -4.275
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))6 -9.860
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))7 -0.765
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))8 NA
## BMXWT 116.877
## RIAGENDRMale -18.667
## years2005-2006 -0.433
## years2007-2008 -1.006
## years2009-2010 0.611
## years2013-2014 1.985
## years2015-2016 2.822
## years2017-2018 0.657
## years2022-2012 -0.190
## Pr(>|t|)
## (Intercept) < 2e-16
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))1 NA
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))2 NA
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))3 NA
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))4 NA
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))5 1.98e-05
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))6 < 2e-16
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))7 0.44414
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))8 NA
## BMXWT < 2e-16
## RIAGENDRMale < 2e-16
## years2005-2006 0.66495
## years2007-2008 0.31431
## years2009-2010 0.54118
## years2013-2014 0.04722
## years2015-2016 0.00481
## years2017-2018 0.51108
## years2022-2012 0.84909
##
## (Intercept) ***
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))1
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))2
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))3
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))4
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))5 ***
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))6 ***
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))7
## bs(RIDAGEYR, knots = seq(-20, 20, by = 10), Boundary.knots = c(-30, 100))8
## BMXWT ***
## RIAGENDRMale ***
## years2005-2006
## years2007-2008
## years2009-2010
## years2013-2014 *
## years2015-2016 **
## years2017-2018
## years2022-2012
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.042 on 2487 degrees of freedom
## Multiple R-squared: 0.8555, Adjusted R-squared: 0.8549
## F-statistic: 1228 on 12 and 2487 DF, p-value: < 2.2e-16
## Warning in predict.lm(lm_reg, newdata = test_df, se = T): prediction from a
## rank-deficient fit may be misleading

# grid.arrange(g1, g2,g3, nrow=3)
regression models with variables
base_form <- "BMXWAIST ~ bs(RIDAGEYR, df = 7) + BMXWT + RIAGENDR + years + "
# MFA 16:1 (Hexadecenoic) (gm)
run_model(paste0(base_form,"DR1TM161"))
##
## Call:
## lm(formula = as.formula(formula_str), data = train_data_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.1828 -4.0777 -0.1006 4.1779 26.8517
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.424802 0.877608 42.644 < 2e-16 ***
## bs(RIDAGEYR, df = 7)1 1.341987 1.315937 1.020 0.307925
## bs(RIDAGEYR, df = 7)2 1.437858 0.942593 1.525 0.127280
## bs(RIDAGEYR, df = 7)3 3.371231 1.014657 3.323 0.000905 ***
## bs(RIDAGEYR, df = 7)4 5.637960 0.887545 6.352 2.51e-10 ***
## bs(RIDAGEYR, df = 7)5 9.188624 1.138373 8.072 1.07e-15 ***
## bs(RIDAGEYR, df = 7)6 10.281208 1.206022 8.525 < 2e-16 ***
## bs(RIDAGEYR, df = 7)7 12.360987 1.240218 9.967 < 2e-16 ***
## BMXWT 0.728377 0.006291 115.780 < 2e-16 ***
## RIAGENDRMale -4.594644 0.257893 -17.816 < 2e-16 ***
## years2005-2006 -0.233204 0.523185 -0.446 0.655824
## years2007-2008 -0.502333 0.493486 -1.018 0.308812
## years2009-2010 0.269400 0.487139 0.553 0.580297
## years2013-2014 0.933972 0.502936 1.857 0.063424 .
## years2015-2016 1.431335 0.515918 2.774 0.005573 **
## years2017-2018 0.284140 0.506228 0.561 0.574652
## years2022-2012 -0.144546 0.507571 -0.285 0.775837
## DR1TM161 -0.338338 0.147945 -2.287 0.022285 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.04 on 2482 degrees of freedom
## Multiple R-squared: 0.8559, Adjusted R-squared: 0.8549
## F-statistic: 867.4 on 17 and 2482 DF, p-value: < 2.2e-16

hist(data$DR1TM161)

hist(invNorm(data$DR1TM161))

run_model(paste0(base_form,"invNorm(DR1TM161)"))
##
## Call:
## lm(formula = as.formula(formula_str), data = train_data_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.5429 -4.0934 -0.1196 4.1601 26.9842
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.962548 0.874557 42.264 < 2e-16 ***
## bs(RIDAGEYR, df = 7)1 1.413289 1.315219 1.075 0.282674
## bs(RIDAGEYR, df = 7)2 1.411971 0.941478 1.500 0.133809
## bs(RIDAGEYR, df = 7)3 3.436053 1.014408 3.387 0.000717 ***
## bs(RIDAGEYR, df = 7)4 5.617117 0.886936 6.333 2.84e-10 ***
## bs(RIDAGEYR, df = 7)5 9.215022 1.137119 8.104 8.27e-16 ***
## bs(RIDAGEYR, df = 7)6 10.270878 1.205089 8.523 < 2e-16 ***
## bs(RIDAGEYR, df = 7)7 12.344797 1.238837 9.965 < 2e-16 ***
## BMXWT 0.728918 0.006279 116.085 < 2e-16 ***
## RIAGENDRMale -4.563409 0.257370 -17.731 < 2e-16 ***
## years2005-2006 -0.223296 0.522759 -0.427 0.669308
## years2007-2008 -0.503376 0.492857 -1.021 0.307191
## years2009-2010 0.259190 0.486626 0.533 0.594340
## years2013-2014 0.917966 0.502233 1.828 0.067704 .
## years2015-2016 1.435866 0.515210 2.787 0.005361 **
## years2017-2018 0.269099 0.505649 0.532 0.594645
## years2022-2012 -0.164762 0.506963 -0.325 0.745209
## invNorm(DR1TM161) -0.383826 0.128875 -2.978 0.002927 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.036 on 2482 degrees of freedom
## Multiple R-squared: 0.8561, Adjusted R-squared: 0.8552
## F-statistic: 868.9 on 17 and 2482 DF, p-value: < 2.2e-16

# Dietary day one sample weight
run_model(paste0(base_form,"WTDRD1"))
##
## Call:
## lm(formula = as.formula(formula_str), data = train_data_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.7615 -4.0387 -0.0509 4.1106 27.2154
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.805e+01 8.745e-01 43.515 < 2e-16 ***
## bs(RIDAGEYR, df = 7)1 1.365e+00 1.306e+00 1.045 0.29607
## bs(RIDAGEYR, df = 7)2 1.300e+00 9.347e-01 1.391 0.16430
## bs(RIDAGEYR, df = 7)3 3.252e+00 1.007e+00 3.230 0.00125 **
## bs(RIDAGEYR, df = 7)4 5.849e+00 8.806e-01 6.642 3.79e-11 ***
## bs(RIDAGEYR, df = 7)5 8.792e+00 1.131e+00 7.772 1.12e-14 ***
## bs(RIDAGEYR, df = 7)6 1.035e+01 1.196e+00 8.649 < 2e-16 ***
## bs(RIDAGEYR, df = 7)7 1.188e+01 1.232e+00 9.639 < 2e-16 ***
## BMXWT 7.262e-01 6.164e-03 117.808 < 2e-16 ***
## RIAGENDRMale -4.736e+00 2.505e-01 -18.903 < 2e-16 ***
## years2005-2006 -1.084e-01 5.193e-01 -0.209 0.83468
## years2007-2008 -5.364e-01 4.891e-01 -1.097 0.27289
## years2009-2010 2.217e-01 4.828e-01 0.459 0.64611
## years2013-2014 1.018e+00 4.973e-01 2.046 0.04082 *
## years2015-2016 1.570e+00 5.114e-01 3.069 0.00217 **
## years2017-2018 4.611e-01 5.013e-01 0.920 0.35776
## years2022-2012 -6.032e-03 5.019e-01 -0.012 0.99041
## WTDRD1 -1.761e-05 2.638e-06 -6.675 3.03e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.993 on 2482 degrees of freedom
## Multiple R-squared: 0.8582, Adjusted R-squared: 0.8572
## F-statistic: 883.4 on 17 and 2482 DF, p-value: < 2.2e-16

# BMXARML - Upper Arm Length (cm)
run_model(paste0(base_form,"BMXARML"))
##
## Call:
## lm(formula = as.formula(formula_str), data = train_data_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.3393 -3.7212 -0.1236 3.7381 23.8669
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 68.372041 2.025362 33.758 < 2e-16 ***
## bs(RIDAGEYR, df = 7)1 1.447563 1.247802 1.160 0.246124
## bs(RIDAGEYR, df = 7)2 1.242831 0.893264 1.391 0.164248
## bs(RIDAGEYR, df = 7)3 3.447961 0.961929 3.584 0.000344 ***
## bs(RIDAGEYR, df = 7)4 5.707560 0.841272 6.784 1.45e-11 ***
## bs(RIDAGEYR, df = 7)5 9.658422 1.079108 8.950 < 2e-16 ***
## bs(RIDAGEYR, df = 7)6 10.779565 1.143595 9.426 < 2e-16 ***
## bs(RIDAGEYR, df = 7)7 13.378958 1.175477 11.382 < 2e-16 ***
## BMXWT 0.793315 0.007112 111.540 < 2e-16 ***
## RIAGENDRMale -2.248113 0.280564 -8.013 1.71e-15 ***
## years2005-2006 -0.048946 0.496145 -0.099 0.921422
## years2007-2008 -0.504109 0.467288 -1.079 0.280783
## years2009-2010 -0.448991 0.463496 -0.969 0.332786
## years2013-2014 0.526270 0.476186 1.105 0.269189
## years2015-2016 0.959409 0.489582 1.960 0.050149 .
## years2017-2018 -0.160745 0.479855 -0.335 0.737663
## years2022-2012 -0.517486 0.480432 -1.077 0.281528
## BMXARML -1.006012 0.059660 -16.862 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.727 on 2482 degrees of freedom
## Multiple R-squared: 0.8705, Adjusted R-squared: 0.8696
## F-statistic: 981.1 on 17 and 2482 DF, p-value: < 2.2e-16

#BMXLEG - Upper Leg Length (cm)
run_model(paste0(base_form,"BMXLEG"))
##
## Call:
## lm(formula = as.formula(formula_str), data = train_data_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.8445 -3.6406 -0.1098 3.5715 26.8938
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 70.257407 1.515146 46.370 < 2e-16 ***
## bs(RIDAGEYR, df = 7)1 1.251076 1.173380 1.066 0.286430
## bs(RIDAGEYR, df = 7)2 0.692536 0.840382 0.824 0.409977
## bs(RIDAGEYR, df = 7)3 2.234994 0.905550 2.468 0.013650 *
## bs(RIDAGEYR, df = 7)4 3.386435 0.796288 4.253 2.19e-05 ***
## bs(RIDAGEYR, df = 7)5 7.291548 1.017514 7.166 1.01e-12 ***
## bs(RIDAGEYR, df = 7)6 6.826024 1.083971 6.297 3.57e-10 ***
## bs(RIDAGEYR, df = 7)7 9.794273 1.109475 8.828 < 2e-16 ***
## BMXWT 0.748006 0.005606 133.423 < 2e-16 ***
## RIAGENDRMale -1.220653 0.263772 -4.628 3.89e-06 ***
## years2005-2006 -1.368641 0.468658 -2.920 0.003528 **
## years2007-2008 -2.100519 0.444221 -4.729 2.39e-06 ***
## years2009-2010 -1.589674 0.440212 -3.611 0.000311 ***
## years2013-2014 -0.307542 0.449992 -0.683 0.494393
## years2015-2016 -0.225389 0.464366 -0.485 0.627458
## years2017-2018 -0.739519 0.452377 -1.635 0.102230
## years2022-2012 -1.611087 0.455193 -3.539 0.000408 ***
## BMXLEG -0.868298 0.034154 -25.423 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.386 on 2482 degrees of freedom
## Multiple R-squared: 0.8855, Adjusted R-squared: 0.8847
## F-statistic: 1129 on 17 and 2482 DF, p-value: < 2.2e-16

# grid.arrange(g1, g2,g3, nrow=3)